Take Home Ex 3

Author

Aruiana

Published

February 5, 2023

Modified

February 14, 2023

1. The Task

To uncover the salient patterns of the resale prices of public housing property by residential towns and estates in Singapore by using appropriate analytical visualisation techniques l

For the purpose of this study, the focus in on 3-ROOM, 4-ROOM and 5-ROOM types in 2022.

2. Data Preparation

##Step 1: Load Packages

pacman::p_load(tidyverse, plotly, crosstalk, DT, ggdist, gganimate, ggstatsplot, heatmaply)

##Step 2: Import Data

#import data
HDB <- read_csv(("data/HDB.csv"))

##Step3: Filter Data for the study

Filter out the data required: 1. Room Type 2. Year 2022

#Filter 3Room, 4Room, 5Room, Filter 2022, Convert remaining lease into years
HDBRoom <- HDB %>% filter(flat_type=="3 ROOM" | flat_type=="4 ROOM" | flat_type=="5 ROOM") %>%
  separate(month, into = c("year", "month")) %>% 
  filter(year == "2022") %>%
  separate(remaining_lease, into = c("rmlease_years", "rmlease_month"), sep = "years") 

##Step 4: Amend Data Set 1. Convert the Month from Character to Number 2. Convert Remaining lease from Character to Number 3. Re-categorise towns into regions 4. Sort Storey Range by smallest to largest 5. Create new dataset for price/sqm

#Convert Month from Chr to number
HDBRoom$month <- as.numeric(HDBRoom$month)

#Convert Remaining lease into numeric years in decimal
HDBRoom$rmlease_years <- as.numeric(HDBRoom$rmlease_years)

HDBRoom$rmlease_month <- gsub("[monthsmonth]", " ", HDBRoom$rmlease_month) %>%
  as.numeric(HDBRoom$rmlease_month) / 12 

HDBRoom$rmlease_month[is.na(HDBRoom$rmlease_month)] = 0

HDBRoom$rmlease <- as.numeric(HDBRoom$rmlease_years + HDBRoom$rmlease_month)

#Group Towns into Regions
HDBRoom$region <- case_when(
  HDBRoom$town %in% c("ANG MO KIO", "HOUGANG", "PUNGGOL", "SERANGOON", "SENGKANG") ~ "North-East",
    HDBRoom$town %in% c("BISHAN", "BUKIT MERAH", "BUKIT TIMAH", "CENTRAL AREA", "GEYLANG", "KALLANG/WHAMPOA", "MARINE PARADE", "QUEENSTOWN", "TOA PAYOH") ~ "Central",
    HDBRoom$town %in% c("BEDOK", "PASIR RIS", "TAMPINES") ~ "East",
    HDBRoom$town %in% c("SEMBAWANG", "WOODLANDS", "YISHUN") ~ "North",
    HDBRoom$town %in% c("BUKIT BATOK", "BUKIT PANJANG", "CHOA CHU KANG", "CLEMENTI", "JURONG EAST", "JURONG WEST") ~ "West")


#Edit storey range and sort by smallest
HDBRoom$storey_range <- str_replace(HDBRoom$storey_range, "TO", "-")

sr_sort = c("01 - 03", "04 - 06", "07 - 09", "10 - 12", "13 - 15", "16 - 18", "19 - 21", "22 - 24","25 - 27","28 - 30", "31 - 33", "34 - 36", "37 - 39", "40 - 42", "43 - 45", "49 - 51", "46 - 48")

HDBRoom$storey_range <- factor (HDBRoom$storey_range, levels = sr_sort)

#Create additional data on price per sqm
HDBRoom$price_per_sqm <- (HDBRoom$resale_price / HDBRoom$floor_area_sqm)

##Step 5: Select the relevant columns for analysis

HDBDATA <- HDBRoom [,!names(HDBRoom) %in% c("year", "block", "street_name", "rmlease_years", "rmlease_month", "flat_model")]
gghistostats(
  data = HDBDATA, x = "floor_area_sqm",
  type = "bayes",
  test.value = 100,
  xlab = "Floor Area (sqm) of property sold"
  )

ggbetweenstats(
  data = HDBDATA,
  x = flat_type, 
  y = resale_price,
  type = "np",
  messages = FALSE
)

ggscatterstats(
  data = HDBDATA,
  x = resale_price,
  y = price_per_sqm,
  marginal = FALSE,
  )

options(scipen = 999)
mean(HDBDATA$resale_price)
[1] 536391.2
min(HDBDATA$resale_price)
[1] 200000
max(HDBDATA$resale_price)
[1] 1418000
mean(HDBDATA$price_per_sqm)
[1] 5735.973
min(HDBDATA$price_per_sqm)
[1] 3333.333
max(HDBDATA$price_per_sqm)
[1] 14731.18
scdata <- highlight_key(HDBDATA) 
  
sc1 <- ggplot(data = scdata, aes(x = town, y = resale_price, fill = region)) + geom_point() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + scale_y_continuous(breaks = c(200000,500000,1000000,150000)) +
  labs(title = "Resale Price by Town", x = "Town", y = "Resale Price")

sc2 <- ggplot(data = scdata, aes(x = town, y = price_per_sqm, fill = region)) + geom_point() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + scale_y_continuous(breaks = c(3000,6000,9000,12000,15000)) +
    labs(title = "Resale Price per sqm by Town", x = "Town", y = "Resale Price/Sqm")

subplot(ggplotly(sc1), ggplotly(sc2))
HDBDATA %>%
  mutate(class = fct_reorder(town, price_per_sqm, .fun="mean")) %>%
  ggplot(aes(y =reorder(town, price_per_sqm),
           x = price_per_sqm, fill = region)) + 
  geom_boxplot() + stat_summary(fun.y=mean, geom = "point", colour="yellow")

HDBDATA %>% 
  group_by(region) %>%
  mutate(class = fct_reorder(region, price_per_sqm, .fun="mean")) %>%
  ggplot(mapping = aes(y = flat_type, x = price_per_sqm)) +
  # Make grouped boxplot
  geom_boxplot(aes(fill = as.factor(region))) +
  theme(legend.position = "top") +
  # Adjust lables and add title
  labs(title = "HDB resale prices in 2022 by region", y="Flat Type", x = "Price per square metre (SGD)", fill = "flat_type")

HDBDATA %>%
  
grouped_gghistostats(
  x                 = resale_price,
  test.value        = 50,
  type              = "nonparametric",
  grouping.var      = region,
  normal.curve      = TRUE,
  normal.curve.args = list(color = "red", size = 1),
  ggtheme           = ggthemes::theme_tufte(),
  ## modify the defaults from `{ggstatsplot}` for each plot
  plotgrid.args     = list(nrow = 2),
  annotation.args   = list(title = "Resale price by region")
)

floorheatmap <-
  HDBDATA %>%
  group_by(town, storey_range) %>%
  summarise(median_price = median(price_per_sqm))

heatmap <- ggplot(data = floorheatmap, 
                  mapping = aes(x = town, y = storey_range, fill = median_price)) +
            geom_tile() +
  labs(title = "Heatmap of HDB breakdown by area and storey", x = "Town", y = "Storey") +
  scale_fill_gradient(name = "Median Resale Price/sqm",
                      low = "peachpuff",
                      high = "deeppink4")+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))

heatmap

a <-
ggplot(HDBDATA, aes(x = rmlease, y = resale_price, 
                      size = floor_area_sqm, 
                      colour = region)) +
  geom_point(alpha = 0.7, 
             show.legend = FALSE) +
  scale_size(range = c(2, 12)) +
  labs(title = '2022: {as.integer(frame_time)} Month', 
       x = 'Remaining Lease', 
       y = 'Resale Price') +
  transition_time(month) +       #<<
  ease_aes('linear')            #<<

a